-
Notifications
You must be signed in to change notification settings - Fork 14.7k
Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang Author: Simon Pilgrim (RKSimon) Changes: Reverts llvm/llvm-project#128938 while a crash regression is investigated. Patch is 38.72 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151960.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
-#include <optional>
#include <queue>
#include <set>
-#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
- bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
- if (!OldLoadTy)
- return false;
-
- unsigned const OldNumElements = OldLoadTy->getNumElements();
-
- // Search all uses of load. If all uses are shufflevector instructions, and
- // the second operands are all poison values, find the minimum and maximum
- // indices of the vector elements referenced by all shuffle masks.
- // Otherwise return `std::nullopt`.
- using IndexRange = std::pair<int, int>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- IndexRange OutputRange = IndexRange(OldNumElements, -1);
- for (llvm::Use &Use : I.uses()) {
- // Ensure all uses match the required pattern.
- User *Shuffle = Use.getUser();
- ArrayRef<int> Mask;
-
- if (!match(Shuffle,
- m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
- return std::nullopt;
-
- // Ignore shufflevector instructions that have no uses.
- if (Shuffle->use_empty())
- continue;
-
- // Find the min and max indices used by the shufflevector instruction.
- for (int Index : Mask) {
- if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
-
- if (OutputRange.second < OutputRange.first)
- return std::nullopt;
-
- return OutputRange;
- };
-
- // Get the range of vector elements used by shufflevector instructions.
- if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
-
- // If the range of vector elements is smaller than the full load, attempt
- // to create a smaller load.
- if (NewNumElements < OldNumElements) {
- IRBuilder Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Calculate costs of old and new ops.
- Type *ElemTy = OldLoadTy->getElementType();
- FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
- Value *PtrOp = OldLoad->getPointerOperand();
-
- InstructionCost OldCost = TTI.getMemoryOpCost(
- Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
- InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
-
- using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
- SmallVector<UseEntry, 4u> NewUses;
-
- for (llvm::Use &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
- // Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Update costs.
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- OldLoadTy, OldMask, CostKind);
- NewCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
- }
-
- LLVM_DEBUG(
- dbgs() << "Found a load used only by shufflevector instructions: "
- << I << "\n OldCost: " << OldCost
- << " vs NewCost: " << NewCost << "\n");
-
- if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create new load of smaller vector.
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
- NewLoad->copyMetadata(I);
-
- // Replace all uses.
- for (UseEntry &Use : NewUses) {
- ShuffleVectorInst *Shuffle = Use.first;
- std::vector<int> &NewMask = Use.second;
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- Value *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
- replaceValue(*Shuffle, *NewShuffle);
- }
-
- return true;
- }
- }
- return false;
-}
-
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Simon Pilgrim (RKSimon) Changes: Reverts llvm/llvm-project#128938 while a crash regression is investigated. Patch is 38.72 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151960.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
-#include <optional>
#include <queue>
#include <set>
-#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
- bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
- if (!OldLoadTy)
- return false;
-
- unsigned const OldNumElements = OldLoadTy->getNumElements();
-
- // Search all uses of load. If all uses are shufflevector instructions, and
- // the second operands are all poison values, find the minimum and maximum
- // indices of the vector elements referenced by all shuffle masks.
- // Otherwise return `std::nullopt`.
- using IndexRange = std::pair<int, int>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- IndexRange OutputRange = IndexRange(OldNumElements, -1);
- for (llvm::Use &Use : I.uses()) {
- // Ensure all uses match the required pattern.
- User *Shuffle = Use.getUser();
- ArrayRef<int> Mask;
-
- if (!match(Shuffle,
- m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
- return std::nullopt;
-
- // Ignore shufflevector instructions that have no uses.
- if (Shuffle->use_empty())
- continue;
-
- // Find the min and max indices used by the shufflevector instruction.
- for (int Index : Mask) {
- if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
-
- if (OutputRange.second < OutputRange.first)
- return std::nullopt;
-
- return OutputRange;
- };
-
- // Get the range of vector elements used by shufflevector instructions.
- if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
-
- // If the range of vector elements is smaller than the full load, attempt
- // to create a smaller load.
- if (NewNumElements < OldNumElements) {
- IRBuilder Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Calculate costs of old and new ops.
- Type *ElemTy = OldLoadTy->getElementType();
- FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
- Value *PtrOp = OldLoad->getPointerOperand();
-
- InstructionCost OldCost = TTI.getMemoryOpCost(
- Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
- InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
-
- using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
- SmallVector<UseEntry, 4u> NewUses;
-
- for (llvm::Use &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
- // Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Update costs.
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- OldLoadTy, OldMask, CostKind);
- NewCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
- }
-
- LLVM_DEBUG(
- dbgs() << "Found a load used only by shufflevector instructions: "
- << I << "\n OldCost: " << OldCost
- << " vs NewCost: " << NewCost << "\n");
-
- if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create new load of smaller vector.
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
- NewLoad->copyMetadata(I);
-
- // Replace all uses.
- for (UseEntry &Use : NewUses) {
- ShuffleVectorInst *Shuffle = Use.first;
- std::vector<int> &NewMask = Use.second;
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- Value *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
- replaceValue(*Shuffle, *NewShuffle);
- }
-
- return true;
- }
- }
- return false;
-}
-
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]
|
@llvm/pr-subscribers-vectorizers
Author: Simon Pilgrim (RKSimon)
Changes: Reverts llvm/llvm-project#128938 while a crash regression is investigated.
Patch is 38.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151960.diff
7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
-#include <optional>
#include <queue>
#include <set>
-#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
- bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
- if (!OldLoadTy)
- return false;
-
- unsigned const OldNumElements = OldLoadTy->getNumElements();
-
- // Search all uses of load. If all uses are shufflevector instructions, and
- // the second operands are all poison values, find the minimum and maximum
- // indices of the vector elements referenced by all shuffle masks.
- // Otherwise return `std::nullopt`.
- using IndexRange = std::pair<int, int>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- IndexRange OutputRange = IndexRange(OldNumElements, -1);
- for (llvm::Use &Use : I.uses()) {
- // Ensure all uses match the required pattern.
- User *Shuffle = Use.getUser();
- ArrayRef<int> Mask;
-
- if (!match(Shuffle,
- m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
- return std::nullopt;
-
- // Ignore shufflevector instructions that have no uses.
- if (Shuffle->use_empty())
- continue;
-
- // Find the min and max indices used by the shufflevector instruction.
- for (int Index : Mask) {
- if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
-
- if (OutputRange.second < OutputRange.first)
- return std::nullopt;
-
- return OutputRange;
- };
-
- // Get the range of vector elements used by shufflevector instructions.
- if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
-
- // If the range of vector elements is smaller than the full load, attempt
- // to create a smaller load.
- if (NewNumElements < OldNumElements) {
- IRBuilder Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Calculate costs of old and new ops.
- Type *ElemTy = OldLoadTy->getElementType();
- FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
- Value *PtrOp = OldLoad->getPointerOperand();
-
- InstructionCost OldCost = TTI.getMemoryOpCost(
- Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
- InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
-
- using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
- SmallVector<UseEntry, 4u> NewUses;
-
- for (llvm::Use &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
- // Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Update costs.
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- OldLoadTy, OldMask, CostKind);
- NewCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
- }
-
- LLVM_DEBUG(
- dbgs() << "Found a load used only by shufflevector instructions: "
- << I << "\n OldCost: " << OldCost
- << " vs NewCost: " << NewCost << "\n");
-
- if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create new load of smaller vector.
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
- NewLoad->copyMetadata(I);
-
- // Replace all uses.
- for (UseEntry &Use : NewUses) {
- ShuffleVectorInst *Shuffle = Use.first;
- std::vector<int> &NewMask = Use.second;
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- Value *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
- replaceValue(*Shuffle, *NewShuffle);
- }
-
- return true;
- }
- }
- return false;
-}
-
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]
|
You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/VectorCombine.cpp llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll llvm/test/Transforms/VectorCombine/X86/load-widening.ll llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
The following files introduce new uses of undef:
- llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
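Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. You should use `poison` values for placeholders instead.
In tests, avoid using `undef` and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.
For example, this is considered a bad practice:
define void @fn() {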
...
br i1 undef, ...
}
Please use the following instead:
define void @fn(i1 %cond) {
...
br i1 %cond, ...
}
Please refer to the Undefined Behavior Manual for more information.
Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" (llvm#151960): Reverts llvm#128938 while a crash regression is investigated
Reverts #128938 while a crash regression is investigated